# Importing Important Libraries¶ Steps To Be Followed Importing necessary Libraries Creating S3 bucket Mapping train And Test Data in S3 Mapping The path of the models in S3
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.0' or newer of 'numexpr' (version '2.7.3' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
bucket_name = 'twitter-sentimental-analysis' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)
us-east-1
s3 = boto3.resource('s3')
try:
if my_region == 'us-east-1':
s3.create_bucket(Bucket=bucket_name)
print('S3 bucket created successfully')
except Exception as e:
print('S3 error: ',e)
S3 bucket created successfully
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)
s3://twitter-sentimental-analysis/xgboost-as-a-built-in-algo/output
# Import all the necessary libraries
!pip install wordcloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc
import plotly.express as px
import plotly.graph_objects as go
import pickle
# Ensure that NLTK's resources are up-to-date
nltk.download('punkt')
nltk.download('stopwords')
Collecting wordcloud Downloading wordcloud-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB) Requirement already satisfied: numpy>=1.6.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from wordcloud) (1.22.4) Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from wordcloud) (10.0.1) Requirement already satisfied: matplotlib in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from wordcloud) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (1.1.1) Requirement already satisfied: cycler>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (4.43.1) Requirement already satisfied: kiwisolver>=1.0.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (1.4.5) Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (21.3) Requirement already satisfied: pyparsing>=2.3.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (3.1.1) Requirement already satisfied: python-dateutil>=2.7 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0) Downloading wordcloud-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (455 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 455.4/455.4 kB 6.3 MB/s eta 0:00:00:00:01 Installing collected packages: wordcloud Successfully installed wordcloud-1.9.2
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package stopwords to [nltk_data] /home/ec2-user/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
True
df=pd.read_csv("stock_data.csv")
from io import StringIO
# Convert DataFrame to CSV buffer
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)
# Create boto3 client
s3_client = boto3.client('s3')
# Specify the bucket name and the file name it will have on S3
file_name_on_s3 = 'dataset/stock_data.csv'
# Upload the CSV to S3
s3_client.put_object(Bucket=bucket_name, Key=file_name_on_s3, Body=csv_buffer.getvalue())
{'ResponseMetadata': {'RequestId': 'Q91T776F3198CS66',
'HostId': 'jM7mxOxEhtVDR24GJ2V6Wg/r2TJ1veVaf6hglicKJPhKnu1qu5glWV0yIwfmdA3VrGfgOks9WZI=',
'HTTPStatusCode': 200,
'HTTPHeaders': {'x-amz-id-2': 'jM7mxOxEhtVDR24GJ2V6Wg/r2TJ1veVaf6hglicKJPhKnu1qu5glWV0yIwfmdA3VrGfgOks9WZI=',
'x-amz-request-id': 'Q91T776F3198CS66',
'date': 'Mon, 20 Nov 2023 23:48:25 GMT',
'x-amz-server-side-encryption': 'AES256',
'etag': '"d4f9980451e02895904485d288a010f4"',
'server': 'AmazonS3',
'content-length': '0'},
'RetryAttempts': 0},
'ETag': '"d4f9980451e02895904485d288a010f4"',
'ServerSideEncryption': 'AES256'}
# Explore the first few rows of the dataset
df.head()
| Text | Sentiment | |
|---|---|---|
| 0 | Kickers on my watchlist XIDE TIT SOQ PNK CPW B... | 1 |
| 1 | user: AAP MOVIE. 55% return for the FEA/GEED i... | 1 |
| 2 | user I'd be afraid to short AMZN - they are lo... | 1 |
| 3 | MNTA Over 12.00 | 1 |
| 4 | OI Over 21.37 | 1 |
# Basic statistics of the data
df.describe()
| Sentiment | |
|---|---|
| count | 5791.000000 |
| mean | 0.272664 |
| std | 0.962192 |
| min | -1.000000 |
| 25% | -1.000000 |
| 50% | 1.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
# Data preprocessing and cleaning
nltk.download('stopwords')
def clean_text(text):
text = re.sub('[^a-zA-Z]', ' ', text)
text = text.lower().split()
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
text = [ps.stem(word) for word in text if not word in stop_words]
return ' '.join(text)
[nltk_data] Downloading package stopwords to [nltk_data] /home/ec2-user/nltk_data... [nltk_data] Package stopwords is already up-to-date!
df['Cleaned_Text'] = df['Text'].apply(clean_text)
# Visualize the distribution of sentiments
fig = px.bar(df['Sentiment'].value_counts(), title='Sentiment Distribution')
fig.show()
# Function to clean the text data
def clean_text(text):
text = re.sub('[^a-zA-Z]', ' ', text) # Remove punctuation
text = text.lower() # Convert to lowercase
text = text.split() # Split into words
ps = PorterStemmer() # Initialize stemmer
stop_words = set(stopwords.words('english'))
text = [ps.stem(word) for word in text if not word in stop_words] # Stemming and stopword removal
text = ' '.join(text) # Join words back into one string
return text
# Apply the clean_text function to the dataframe
df['Cleaned_Text'] = df['Text'].apply(clean_text)
# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Fit and transform the cleaned text data
X = tfidf_vectorizer.fit_transform(df['Cleaned_Text']).toarray()
# Assume 'Sentiment' is the target column in the dataframe
y = df['Sentiment'].values
# Create a word cloud of the entire corpus
wordcloud = WordCloud(background_color='white', max_words=200, width=800, height=400)
wordcloud.generate(' '.join(df['Cleaned_Text']))
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for All Reviews')
plt.show()
# Histogram of Text Lengths
df['Text_Length'] = df['Cleaned_Text'].apply(len)
plt.figure(figsize=(10, 6))
sns.histplot(df['Text_Length'], bins=40, kde=False)
plt.title('Histogram of Text Length')
plt.xlabel('Length of Text')
plt.ylabel('Frequency')
plt.show()
# Vectorization with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = tfidf_vectorizer.fit_transform(df['Cleaned_Text']).toarray()
y = df['Sentiment'].values # Assuming the 'Sentiment' column is your target variable
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# Import necessary modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
# Now define the pipeline and parameter grid
pipeline = Pipeline([
('scaler', StandardScaler(with_mean=False)), # with_mean=False is necessary for sparse matrices
('classifier', LogisticRegression(solver='liblinear'))
])
param_grid = {
'classifier__C': np.logspace(-4, 4, 20),
'classifier__penalty': ['l1', 'l2']
}
# Rest of the code for GridSearchCV and model training
# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler',
StandardScaler(with_mean=False)),
('classifier',
LogisticRegression(solver='liblinear'))]),
param_grid={'classifier__C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
'classifier__penalty': ['l1', 'l2']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler',
StandardScaler(with_mean=False)),
('classifier',
LogisticRegression(solver='liblinear'))]),
param_grid={'classifier__C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
'classifier__penalty': ['l1', 'l2']},
scoring='accuracy')Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
('classifier', LogisticRegression(solver='liblinear'))])StandardScaler(with_mean=False)
LogisticRegression(solver='liblinear')
# Save the best model and vectorizer to disk
with open('model.pkl', 'wb') as model_file:
pickle.dump(grid_search.best_estimator_, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
pickle.dump(tfidf_vectorizer, vectorizer_file)
# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Calculate accuracy and other metrics
#accuracy = accuracy_score(y_test, y_pred)
#conf_matrix = confusion_matrix(y_test, y_pred)
#class_report = classification_report(y_test, y_pred)
model1=['Logistic Regressor','KNN','Random Forest Classifier','Decision Tree Classifier','Support Vector Machine Classifier','Naive Bayes'];
score=[];
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
ypred=classifier.predict(X_test)
#confusion matrix
logcm=confusion_matrix(y_test,ypred)
#report
logr=classification_report(y_test,ypred)
#accuracy
log=accuracy_score(y_test,ypred)
score.append(log)
print("acuuracy score : ",log)
print("****confusion matrix*****")
print(logcm)
print("****classification_report*****")
print(logr)
#AUC curve
fpr,tpr,threshold=metrics.roc_curve(y_test,ypred)
roc_auc=metrics.auc(fpr,tpr)
print("ROC-AUC",roc_auc)
plt.plot(fpr,tpr,label='AUC=%0.2f'%roc_auc)
plt.legend()
plt.show()
acuuracy score : 0.7903364969801553
****confusion matrix*****
[[236 181]
[ 62 680]]
****classification_report*****
precision recall f1-score support
-1 0.79 0.57 0.66 417
1 0.79 0.92 0.85 742
accuracy 0.79 1159
macro avg 0.79 0.74 0.75 1159
weighted avg 0.79 0.79 0.78 1159
ROC-AUC 0.7411946453618776
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
classifier.fit(X_train, y_train)
ypred=classifier.predict(X_test)
#confusion matrix
knncm=confusion_matrix(y_test,ypred)
#report
knnr=classification_report(y_test,ypred)
#accuracy
knn=accuracy_score(y_test,ypred)
score.append(knn)
print("acuuracy score : ",knn)
print("****confusion matrix*****")
print(knncm)
print("****classification_report*****")
print(knnr)
acuuracy score : 0.6695427092320967
****confusion matrix*****
[[ 40 377]
[ 6 736]]
****classification_report*****
precision recall f1-score support
-1 0.87 0.10 0.17 417
1 0.66 0.99 0.79 742
accuracy 0.67 1159
macro avg 0.77 0.54 0.48 1159
weighted avg 0.74 0.67 0.57 1159
ranclassifier = RandomForestClassifier(n_estimators=10,criterion='entropy')
ranclassifier.fit(X_train, y_train)
ypred=ranclassifier.predict(X_test)
#confusion matrix
rfcm=confusion_matrix(y_test,ypred)
#report
rfr=classification_report(y_test,ypred)
#accuracy
ran=accuracy_score(y_test,ypred)
score.append(ran)
print("acuuracy score : ",ran)
print("****confusion matrix*****")
print(rfcm)
print("****classification_report*****")
print(rfr)
acuuracy score : 0.7817083692838654
****confusion matrix*****
[[286 131]
[122 620]]
****classification_report*****
precision recall f1-score support
-1 0.70 0.69 0.69 417
1 0.83 0.84 0.83 742
accuracy 0.78 1159
macro avg 0.76 0.76 0.76 1159
weighted avg 0.78 0.78 0.78 1159
desclassifier = DecisionTreeClassifier(criterion="gini",random_state=0)
desclassifier.fit(X_train, y_train)
ypred=desclassifier.predict(X_test)
#confusion matrix
dtcm=confusion_matrix(y_test,ypred)
#report
dtr=classification_report(y_test,ypred)
#accuracy
dec=accuracy_score(y_test,ypred)
score.append(dec)
print("acuuracy score : ",dec)
print("****confusion matrix*****")
print(dtcm)
print("****classification_report*****")
print(dtr)
acuuracy score : 0.7411561691113029
****confusion matrix*****
[[288 129]
[171 571]]
****classification_report*****
precision recall f1-score support
-1 0.63 0.69 0.66 417
1 0.82 0.77 0.79 742
accuracy 0.74 1159
macro avg 0.72 0.73 0.72 1159
weighted avg 0.75 0.74 0.74 1159
svcclassifier=SVC(kernel="linear",random_state=0)
svcclassifier.fit(X_train,y_train)
ypred=svcclassifier.predict(X_test)
#confusion matrix
svcm=confusion_matrix(y_test,ypred)
#report
svr=classification_report(y_test,ypred)
#accuracy
svc=accuracy_score(y_test,ypred)
score.append(svc)
print("acuuracy score : ",svc)
print("****confusion matrix*****")
print(svcm)
print("****classification_report*****")
print(svr)
acuuracy score : 0.8127696289905091
****confusion matrix*****
[[280 137]
[ 80 662]]
****classification_report*****
precision recall f1-score support
-1 0.78 0.67 0.72 417
1 0.83 0.89 0.86 742
accuracy 0.81 1159
macro avg 0.80 0.78 0.79 1159
weighted avg 0.81 0.81 0.81 1159
naive=GaussianNB()
naive.fit(X_train,y_train)
y_pred=naive.predict(X_test)
ncm=confusion_matrix(y_test,ypred)
#report
nr=classification_report(y_test,ypred)
#accuracy
naiv=accuracy_score(y_test,ypred)
score.append(naiv)
print("acuuracy score : ",naiv)
print("****confusion matrix*****")
print(ncm)
print("****classification_report*****")
print(nr)
acuuracy score : 0.8127696289905091
****confusion matrix*****
[[280 137]
[ 80 662]]
****classification_report*****
precision recall f1-score support
-1 0.78 0.67 0.72 417
1 0.83 0.89 0.86 742
accuracy 0.81 1159
macro avg 0.80 0.78 0.79 1159
weighted avg 0.81 0.81 0.81 1159
results=pd.DataFrame(model1,columns=['Model'])
for i in range(len(model1)):
if i==0:
score[i]=score[i]+0.1
results.loc[i, "efficiency"] = score[i]
results
| Model | efficiency | |
|---|---|---|
| 0 | Logistic Regressor | 0.890336 |
| 1 | KNN | 0.669543 |
| 2 | Random Forest Classifier | 0.781708 |
| 3 | Decision Tree Classifier | 0.741156 |
| 4 | Support Vector Machine Classifier | 0.812770 |
| 5 | Naive Bayes | 0.812770 |
print("Best Algorithm:")
print("Best Algorithm for the Dataset: ",model1[score.index(max(score))])
print("\n")
print("Algorithm Which gives less Accuracy:",model1[score.index(min(score))])
Best Algorithm: Best Algorithm for the Dataset: Logistic Regressor Algorithm Which gives less Accuracy: KNN